In this document, we will visualise the cleaned cricket data.
library(tidyverse)
## ── Attaching packages ────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(mcvis)
# Load the career-level Test batting table. readr guesses the column types
# and reports the specification it settled on.
clean_test_batting_career <- readr::read_csv(
  file = "./clean_test_batting_career.csv"
)
## Parsed with column specification:
## cols(
## country = col_character(),
## player_name = col_character(),
## span = col_character(),
## career_start = col_double(),
## career_end = col_double(),
## mat = col_double(),
## inns = col_double(),
## not_out = col_double(),
## runs = col_double(),
## hs = col_character(),
## ave = col_double(),
## century = col_double(),
## half_century = col_double(),
## ducks = col_double(),
## fours = col_double(),
## sixes = col_double(),
## balls_faced = col_double(),
## strike_rate = col_double()
## )
# Load the innings-level Test batting table. The object and file names are
# kept exactly as they are spelled on disk.
clean_test_batting_inngings <- readr::read_csv(
  file = "./clean_test_batting_inngings.csv"
)
## Parsed with column specification:
## cols(
## country = col_character(),
## player_name = col_character(),
## runs = col_character(),
## mins = col_double(),
## balls_faced = col_double(),
## fours = col_double(),
## sixes = col_double(),
## strike_rate = col_double(),
## pos = col_double(),
## dismissal = col_character(),
## inns = col_double(),
## opposition = col_character(),
## ground = col_character(),
## start_date = col_character(),
## test_number = col_character()
## )
# Compare two recorded career statistics with the same quantities recomputed
# from the raw counts. Points lying on the red line (intercept 0, slope 1)
# are cases where the recorded and recomputed values agree.
# Explicit axis labels are supplied because the defaults would be the full
# deparsed `clean_test_batting_career$...` expressions.

# Recorded strike rate divided by 100 versus runs per ball faced.
plot(
  x = clean_test_batting_career$strike_rate / 100,
  y = clean_test_batting_career$runs / clean_test_batting_career$balls_faced,
  xlab = "strike_rate / 100",
  ylab = "runs / balls_faced"
)
abline(a = 0, b = 1, col = "red")

# Recorded average versus runs per dismissal (innings minus not-outs).
plot(
  x = clean_test_batting_career$ave,
  y = clean_test_batting_career$runs /
    (clean_test_batting_career$inns - clean_test_batting_career$not_out),
  xlab = "ave",
  ylab = "runs / (inns - not_out)"
)
abline(a = 0, b = 1, col = "red")
Looking at the correlation plot, it is not clear which variable is the main culprit behind the multicollinearity.
# Build the matrix analysed below:
#   1. keep only the numeric columns and drop rows containing any NA;
#   2. derive `outs` as innings minus not-outs;
#   3. drop the career-span columns and the raw counts listed in select();
#   4. put every remaining column on a log10(x + 1) scale.
X <- clean_test_batting_career %>%
  dplyr::select_if(is.numeric) %>%
  na.omit() %>%
  dplyr::mutate(outs = inns - not_out) %>%
  dplyr::select(-c(career_start, career_end, inns, mat, not_out, runs)) %>%
  dplyr::mutate_all(~ log10(. + 1L))

# Compact column-by-column preview of the result.
dplyr::glimpse(X)
## Observations: 810
## Variables: 9
## $ ave <dbl> 1.1914510, 0.8450980, 1.4410664, 1.3802112, 0.84509…
## $ century <dbl> 0.3010300, 0.0000000, 0.3010300, 0.3010300, 0.00000…
## $ half_century <dbl> 0.0000000, 0.0000000, 0.6989700, 0.6020600, 0.00000…
## $ ducks <dbl> 0.6989700, 0.4771213, 0.6020600, 0.6020600, 1.04139…
## $ fours <dbl> 1.278754, 0.301030, 1.832509, 1.832509, 1.041393, 2…
## $ sixes <dbl> 0.0000000, 0.0000000, 0.4771213, 0.8450980, 0.30103…
## $ balls_faced <dbl> 2.628389, 2.181844, 3.199206, 2.850646, 2.294466, 3…
## $ strike_rate <dbl> 1.588047, 1.395152, 1.650890, 1.840232, 1.724604, 1…
## $ outs <dbl> 1.0791812, 0.8450980, 1.4313638, 1.3424227, 1.25527…
# Combination of the log-scale columns strike_rate, balls_faced and outs that
# is compared against ave. It is computed once so that the scatter plot and
# the correlation below are guaranteed to use the same vector.
ave_from_components <- with(
  X,
  strike_rate + balls_faced - outs - log10(100)
)

# Scatter of ave against that combination; the red line is the identity
# (intercept 0, slope 1).
plot(
  x = X$ave,
  y = ave_from_components,
  xlab = "ave",
  ylab = "strike_rate + balls_faced - outs - log10(100)"
)
abline(a = 0, b = 1, col = "red")

# Pearson correlation between the two quantities.
cor(X$ave, ave_from_components)
## [1] 0.9935072
# Print per-column summary statistics for every column of X.
skimr::skim(X)
## Skim summary statistics
## n obs: 810
## n variables: 9
##
## ── Variable type:numeric ──────────────────────────────────
## variable missing complete n mean sd p0 p25 p50 p75 p100
## ave 0 810 810 1.3 0.29 0.3 1.1 1.36 1.52 1.81
## balls_faced 0 810 810 3.07 0.64 1.2 2.6 3.09 3.53 4.49
## century 0 810 810 0.29 0.42 0 0 0 0.57 1.66
## ducks 0 810 810 0.7 0.36 0 0.48 0.7 0.95 1.64
## fours 0 810 810 1.77 0.69 0 1.28 1.8 2.26 3.22
## half_century 0 810 810 0.54 0.53 0 0 0.48 0.95 1.83
## outs 0 810 810 1.45 0.46 0.3 1.12 1.46 1.8 2.44
## sixes 0 810 810 0.63 0.54 0 0 0.6 1 2.03
## strike_rate 0 810 810 1.63 0.14 0.97 1.57 1.65 1.72 1.95
## hist
## ▁▁▂▃▅▆▇▃
## ▁▂▅▆▇▇▅▂
## ▇▂▁▁▁▁▁▁
## ▃▅▇▆▇▅▂▁
## ▁▂▆▆▇▇▅▂
## ▇▂▂▃▂▂▁▁
## ▁▃▅▇▇▇▅▂
## ▇▅▃▅▃▂▁▁
## ▁▁▁▁▃▇▅▁
# Three views of the pairwise correlation structure of X.

# Mixed correlation plot with squares in the upper triangle.
X %>%
  cor() %>%
  corrplot::corrplot.mixed(upper = "square")

# Heatmap of the same correlation matrix.
X %>%
  cor() %>%
  d3heatmap::d3heatmap()

# Scatterplot matrix; the lower panels are drawn by CPOP::panel_cor.
pairs(X, lower.panel = CPOP::panel_cor)
# Regress ave on every other column of X, then print the model summary.
m <- lm(formula = ave ~ ., data = X)
summary(m)
##
## Call:
## lm(formula = ave ~ ., data = X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.038521 -0.008527 -0.001799 0.005104 0.204354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.272851 0.023695 -53.719 < 2e-16 ***
## century 0.032785 0.002999 10.934 < 2e-16 ***
## half_century 0.037008 0.003229 11.463 < 2e-16 ***
## ducks 0.006572 0.003679 1.786 0.0744 .
## fours 0.050714 0.007768 6.528 1.18e-10 ***
## sixes 0.012127 0.002180 5.564 3.60e-08 ***
## balls_faced 0.826033 0.007218 114.448 < 2e-16 ***
## strike_rate 0.810211 0.010497 77.188 < 2e-16 ***
## outs -0.978147 0.007212 -135.635 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01807 on 801 degrees of freedom
## Multiple R-squared: 0.996, Adjusted R-squared: 0.996
## F-statistic: 2.51e+04 on 8 and 801 DF, p-value: < 2.2e-16
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Variance inflation factors for the predictors of the fitted model `m`.
car::vif(m)
## century half_century ducks fours sixes
## 3.980770 7.191969 4.315878 70.607615 3.407500
## balls_faced strike_rate outs
## 52.709086 5.223086 26.840815
The mcvis method seems to point to the runs variable as the main cause of collinearity.
# Run mcvis on X using the "studentise" standardisation method.
mcvis_result <- mcvis::mcvis(X, standardise_method = "studentise")

# Show the MC matrix from the result, rounded to two decimal places.
round(mcvis_result$MC, digits = 2)
## ave century half_century ducks fours sixes balls_faced strike_rate
## tau9 0.02 0.08 0.40 0.02 0.04 0.39 0.01 0.02
## tau8 0.05 0.04 0.34 0.16 0.06 0.11 0.04 0.14
## tau7 0.13 0.09 0.19 0.06 0.03 0.25 0.15 0.07
## tau6 0.26 0.16 0.04 0.00 0.01 0.37 0.11 0.03
## tau5 0.01 0.21 0.03 0.06 0.02 0.53 0.06 0.07
## tau4 0.03 0.14 0.76 0.01 0.01 0.01 0.01 0.02
## tau3 0.19 0.01 0.04 0.70 0.01 0.01 0.02 0.00
## tau2 0.04 0.00 0.00 0.00 0.87 0.00 0.06 0.00
## tau1 0.15 0.00 0.00 0.00 0.00 0.00 0.56 0.00
## outs
## tau9 0.02
## tau8 0.06
## tau7 0.03
## tau6 0.01
## tau5 0.01
## tau4 0.01
## tau3 0.02
## tau2 0.02
## tau1 0.28
# Draw the mcvis plot for `mcvis_result`.
mcvis_result %>%
  mcvis::ggplot_mcvis()

# Record the R version and package versions used for this analysis.
utils::sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] car_3.0-3 carData_3.0-2 mcvis_0.2.11 forcats_0.4.0
## [5] stringr_1.4.0 dplyr_0.8.3 purrr_0.3.2 readr_1.3.1
## [9] tidyr_1.0.0 tibble_2.1.3 ggplot2_3.2.1 tidyverse_1.2.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-141 lubridate_1.7.4 doParallel_1.0.15
## [4] RColorBrewer_1.1-2 httr_1.4.1 tools_3.6.1
## [7] backports_1.1.5 utf8_1.1.4 R6_2.4.0
## [10] DT_0.9 rpart_4.1-15 lazyeval_0.2.2
## [13] colorspace_1.4-1 nnet_7.3-12 withr_2.1.2
## [16] tidyselect_0.2.5 mnormt_1.5-5 curl_4.2
## [19] compiler_3.6.1 glmnet_2.0-18 cli_1.1.0
## [22] rvest_0.3.4 CPOP_0.0.19 xml2_1.2.2
## [25] labeling_0.3 d3heatmap_0.6.1.2 slam_0.1-45
## [28] scales_1.0.0 mvtnorm_1.0-11 psych_1.8.12
## [31] proxy_0.4-23 digest_0.6.21 foreign_0.8-72
## [34] rmarkdown_1.16 rio_0.5.16 base64enc_0.1-3
## [37] pkgconfig_2.0.3 htmltools_0.4.0 htmlwidgets_1.5.1
## [40] rlang_0.4.0 readxl_1.3.1 rstudioapi_0.10
## [43] visNetwork_2.0.8 generics_0.0.2 jsonlite_1.6
## [46] zip_2.0.4 ModelMetrics_1.2.2 magrittr_1.5
## [49] Matrix_1.2-17 Rcpp_1.0.2 munsell_0.5.0
## [52] fansi_0.4.0 abind_1.4-5 lifecycle_0.1.0
## [55] stringi_1.4.3 yaml_2.2.0 MASS_7.3-51.4
## [58] plyr_1.8.4 recipes_0.1.7 grid_3.6.1
## [61] parallel_3.6.1 HDCI_1.0-2 crayon_1.3.4
## [64] lattice_0.20-38 splines_3.6.1 haven_2.1.1
## [67] hms_0.5.1 zeallot_0.1.0 knitr_1.25
## [70] pillar_1.4.2 igraph_1.2.4.1 stats4_3.6.1
## [73] reshape2_1.4.3 codetools_0.2-16 glue_1.3.1
## [76] evaluate_0.14 data.table_1.12.4 modelr_0.1.5
## [79] png_0.1-7 vctrs_0.2.0 foreach_1.4.7
## [82] cellranger_1.1.0 gtable_0.3.0 assertthat_0.2.1
## [85] openxlsx_4.1.0.1 xfun_0.10 gower_0.2.1
## [88] prodlim_2018.04.18 skimr_1.0.7 broom_0.5.2
## [91] e1071_1.7-2 class_7.3-15 survival_2.44-1.1
## [94] timeDate_3043.102 iterators_1.0.12 lava_1.6.6
## [97] corrplot_0.84 caret_6.0-84 ipred_0.9-9